df <- read.xls('../data/Data_Cortex_Nuclear.xls')
df <- df %>%
mutate(Genotype = factor(Genotype), Treatment = factor(Treatment),
Behavior = factor(Behavior), class = factor(class))
In this dataset is 1080 mice
We can group mice by:
Balance of these groups:
| Genotype | Balance |
|---|---|
| Control | 0.528 |
| Ts65Dn | 0.472 |
| Treatment | Balance |
|---|---|
| Memantine | 0.528 |
| Saline | 0.472 |
| Behavior | Balance |
|---|---|
| C/S | 0.486 |
| S/C | 0.514 |
| class | Balance |
|---|---|
| c-CS-m | 0.139 |
| c-CS-s | 0.125 |
| c-SC-m | 0.139 |
| c-SC-s | 0.125 |
| t-CS-m | 0.125 |
| t-CS-s | 0.097 |
| t-SC-m | 0.125 |
| t-SC-s | 0.125 |
552 observations is fully complete. If we want to know complete observations by each feature:
colSums(!is.na(df))
## MouseID DYRK1A_N ITSN1_N BDNF_N NR1_N
## 1080 1077 1077 1077 1077
## NR2A_N pAKT_N pBRAF_N pCAMKII_N pCREB_N
## 1077 1077 1077 1077 1077
## pELK_N pERK_N pJNK_N PKCA_N pMEK_N
## 1077 1077 1077 1077 1077
## pNR1_N pNR2A_N pNR2B_N pPKCAB_N pRSK_N
## 1077 1077 1077 1077 1077
## AKT_N BRAF_N CAMKII_N CREB_N ELK_N
## 1077 1077 1077 1077 1062
## ERK_N GSK3B_N JNK_N MEK_N TRKA_N
## 1077 1077 1077 1073 1077
## RSK_N APP_N Bcatenin_N SOD1_N MTOR_N
## 1077 1077 1062 1077 1077
## P38_N pMTOR_N DSCR1_N AMPKA_N NR2B_N
## 1077 1077 1077 1077 1077
## pNUMB_N RAPTOR_N TIAM1_N pP70S6_N NUMB_N
## 1077 1077 1077 1077 1080
## P70S6_N pGSK3B_N pPKCG_N CDK5_N S6_N
## 1080 1080 1080 1080 1080
## ADARB1_N AcetylH3K9_N RRP1_N BAX_N ARC_N
## 1080 1080 1080 1080 1080
## ERBB4_N nNOS_N Tau_N GFAP_N GluR3_N
## 1080 1080 1080 1080 1080
## GluR4_N IL1B_N P3525_N pCASP9_N PSD95_N
## 1080 1080 1080 1080 1080
## SNCA_N Ubiquitin_N pGSK3B_Tyr216_N SHH_N BAD_N
## 1080 1080 1080 1080 867
## BCL2_N pS6_N pCFOS_N SYP_N H3AcK18_N
## 795 1080 1005 1080 900
## EGR1_N H3MeK4_N CaNA_N Genotype Treatment
## 870 810 1080 1080 1080
## Behavior class
## 1080 1080
data_summary <- function(data, varname, groupnames){
summary_func <- function(x, col){
c(mean = mean(x[[col]], na.rm=TRUE),
sd = sd(x[[col]], na.rm=TRUE))
}
data_sum <- plyr::ddply(data, groupnames, .fun=summary_func,
varname)
data_sum <- plyr::rename(data_sum, c("mean" = varname))
return(data_sum)
}
dff <- data_summary(df, 'BDNF_N', 'class')
ggplot(dff, aes(class, BDNF_N)) +
geom_line() +
geom_pointrange(aes(ymin=BDNF_N-sd, ymax=BDNF_N+sd)) +
ggtitle('BDNF_N expression by group')
aov_test <- summary(aov(BDNF_N ~ class, df))
BDNF_N expression significantly depends on class (F = 18.816, p value = 8.857^{-24}, df 1 = 7, df 2 = 1069).
proteins <- df %>%
na.omit() %>%
select(where(is.numeric))
fit_full <- lm(ERBB4_N ~ ., proteins)
fit_null <- lm(ERBB4_N ~ 1, proteins)
scope = list(lower = fit_null, upper = fit_full)
optimal_fit <- step(fit_null, scope = scope, direction = "forward", trace = FALSE)
lm_summary <- summary(optimal_fit)
lm_summary
##
## Call:
## lm(formula = ERBB4_N ~ ARC_N + pGSK3B_Tyr216_N + AcetylH3K9_N +
## SYP_N + IL1B_N + TRKA_N + BAX_N + pGSK3B_N + BAD_N + BRAF_N +
## DSCR1_N + P3525_N + PSD95_N + GluR4_N + ERK_N + pAKT_N +
## pBRAF_N + MTOR_N + NR1_N + pNR2B_N + pMTOR_N + NR2B_N + DYRK1A_N +
## Tau_N + pCREB_N + SHH_N + pCASP9_N + pJNK_N + pERK_N + pCAMKII_N +
## pPKCG_N + pPKCAB_N + ADARB1_N + BDNF_N + RRP1_N + S6_N, data = proteins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.02238 -0.00324 -0.00021 0.00322 0.03189
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.008659 0.005377 1.61 0.10795
## ARC_N 0.145701 0.050472 2.89 0.00406 **
## pGSK3B_Tyr216_N 0.026663 0.007038 3.79 0.00017 ***
## AcetylH3K9_N 0.006171 0.004449 1.39 0.16601
## SYP_N 0.018280 0.008131 2.25 0.02499 *
## IL1B_N 0.029177 0.008189 3.56 0.00040 ***
## TRKA_N -0.000138 0.011409 -0.01 0.99037
## BAX_N -0.149413 0.026642 -5.61 3.3e-08 ***
## pGSK3B_N 0.154068 0.027653 5.57 4.1e-08 ***
## BAD_N -0.037307 0.021855 -1.71 0.08842 .
## BRAF_N 0.020529 0.008539 2.40 0.01656 *
## DSCR1_N -0.014621 0.007513 -1.95 0.05219 .
## P3525_N 0.065503 0.018012 3.64 0.00030 ***
## PSD95_N 0.023259 0.002641 8.81 < 2e-16 ***
## GluR4_N -0.101521 0.022379 -4.54 7.1e-06 ***
## ERK_N 0.005924 0.001636 3.62 0.00032 ***
## pAKT_N 0.075043 0.020293 3.70 0.00024 ***
## pBRAF_N -0.048064 0.026807 -1.79 0.07356 .
## MTOR_N 0.037294 0.011647 3.20 0.00145 **
## NR1_N -0.019723 0.003809 -5.18 3.2e-07 ***
## pNR2B_N 0.017637 0.004804 3.67 0.00027 ***
## pMTOR_N -0.007278 0.006338 -1.15 0.25133
## NR2B_N 0.011634 0.007566 1.54 0.12471
## DYRK1A_N -0.006803 0.005693 -1.19 0.23267
## Tau_N 0.066630 0.014181 4.70 3.4e-06 ***
## pCREB_N -0.063442 0.019910 -3.19 0.00153 **
## SHH_N 0.041662 0.015214 2.74 0.00639 **
## pCASP9_N 0.008525 0.002296 3.71 0.00023 ***
## pJNK_N -0.040549 0.016700 -2.43 0.01552 *
## pERK_N -0.017658 0.005448 -3.24 0.00127 **
## pCAMKII_N 0.001064 0.000390 2.73 0.00662 **
## pPKCG_N -0.006362 0.001525 -4.17 3.5e-05 ***
## pPKCAB_N 0.005765 0.001802 3.20 0.00146 **
## ADARB1_N -0.002210 0.001349 -1.64 0.10205
## BDNF_N 0.036349 0.016161 2.25 0.02492 *
## RRP1_N -0.014890 0.009302 -1.60 0.11005
## S6_N 0.006740 0.004459 1.51 0.13129
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.00578 on 515 degrees of freedom
## Multiple R-squared: 0.858, Adjusted R-squared: 0.848
## F-statistic: 86.5 on 36 and 515 DF, p-value: <2e-16
Adjusted R-squared is 0.848, df is 515 with 37 predictors. So, this is a just good model, but not excellent.
Let’s compare full model and optimal model:
anova(fit_full, optimal_fit)
## Analysis of Variance Table
##
## Model 1: ERBB4_N ~ DYRK1A_N + ITSN1_N + BDNF_N + NR1_N + NR2A_N + pAKT_N +
## pBRAF_N + pCAMKII_N + pCREB_N + pELK_N + pERK_N + pJNK_N +
## PKCA_N + pMEK_N + pNR1_N + pNR2A_N + pNR2B_N + pPKCAB_N +
## pRSK_N + AKT_N + BRAF_N + CAMKII_N + CREB_N + ELK_N + ERK_N +
## GSK3B_N + JNK_N + MEK_N + TRKA_N + RSK_N + APP_N + Bcatenin_N +
## SOD1_N + MTOR_N + P38_N + pMTOR_N + DSCR1_N + AMPKA_N + NR2B_N +
## pNUMB_N + RAPTOR_N + TIAM1_N + pP70S6_N + NUMB_N + P70S6_N +
## pGSK3B_N + pPKCG_N + CDK5_N + S6_N + ADARB1_N + AcetylH3K9_N +
## RRP1_N + BAX_N + ARC_N + nNOS_N + Tau_N + GFAP_N + GluR3_N +
## GluR4_N + IL1B_N + P3525_N + pCASP9_N + PSD95_N + SNCA_N +
## Ubiquitin_N + pGSK3B_Tyr216_N + SHH_N + BAD_N + BCL2_N +
## pS6_N + pCFOS_N + SYP_N + H3AcK18_N + EGR1_N + H3MeK4_N +
## CaNA_N
## Model 2: ERBB4_N ~ ARC_N + pGSK3B_Tyr216_N + AcetylH3K9_N + SYP_N + IL1B_N +
## TRKA_N + BAX_N + pGSK3B_N + BAD_N + BRAF_N + DSCR1_N + P3525_N +
## PSD95_N + GluR4_N + ERK_N + pAKT_N + pBRAF_N + MTOR_N + NR1_N +
## pNR2B_N + pMTOR_N + NR2B_N + DYRK1A_N + Tau_N + pCREB_N +
## SHH_N + pCASP9_N + pJNK_N + pERK_N + pCAMKII_N + pPKCG_N +
## pPKCAB_N + ADARB1_N + BDNF_N + RRP1_N + S6_N
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 476 0.0165
## 2 515 0.0172 -39 -0.000743 0.55 0.99
p value is 0.988. This mean that optimal model and full model have not difference then optimal model is better than the full one.
proteins_pca <- rda(proteins, scale = TRUE)
biplot(proteins_pca)
biplot(proteins_pca, scaling = "species", display = "species")
biplot(proteins_pca, scaling = "sites", display = "sites")
pca_summary <- summary(proteins_pca)
pca_result <- as.data.frame(pca_summary$cont)
plot_data <- as.data.frame(t(pca_result[c("Proportion Explained"),][1:24]))
plot_data$Component <- seq(1:nrow(plot_data))
ggplot(plot_data, aes(Component, `Proportion Explained`)) +
geom_bar(stat = "identity") +
scale_x_discrete(limits = seq(1:nrow(plot_data))) +
ylab('Proportion Explained (95 %)') +
xlab('Principal Component') +
theme_bw() +
ggtitle('Proportion Explained by Principal Component')
plot_data <- as.data.frame(pca_summary$sites[,1:3])
plot_ly(x = plot_data$PC1,
y = plot_data$PC2,
z = plot_data$PC3,
type = "scatter3d",
mode = "markers") %>%
layout(title = 'PCA 3D')
Form ExpressionSet:
proteins_dif <- df %>%
select(where(is.numeric))
proteins_dif <- normalizeQuantiles(log2(proteins_dif + 1))
row.names(proteins_dif) <- df$MouseID
factor_dif <- df['Genotype']
expr_data <- as.matrix(t(proteins_dif))
pheno_data <- df['Genotype']
row.names(pheno_data) <- df$MouseID
pheno_metadata <- data.frame(
labelDescription = c('Genotype'),
row.names = c('Genotype'))
pheno_data <- new("AnnotatedDataFrame",
data = pheno_data,
varMetadata = pheno_metadata)
feature_data <- data.frame(Protein = rownames(expr_data))
rownames(feature_data) <- rownames(expr_data)
feature_metadata <- data.frame(
labelDescription = c("Protein"),
row.names = c("Protein"))
f_data <- new("AnnotatedDataFrame",
data = feature_data,
varMetadata = feature_metadata)
experiment_data <-
new("MIAME",
name = "Katheleen J. Gardiner",
lab = "lab",
contact = "katheleen.gardiner@ucdenver.edu",
title = "Down syndrome detectable signals in the nuclear fraction of cortex",
abstract = "Abstract",
other = list(notes = "Dataset from Katheleen J. Gardiner et al. 2015"))
exp_set <-
ExpressionSet(assayData = expr_data,
phenoData = pheno_data,
featureData = f_data,
experimentData = experiment_data)
Linear model for limma:
X <- model.matrix(~ Genotype, pData(exp_set))
fit <- lmFit(exp_set, design = X, method = "robust", maxit = 1000)
efit <- eBayes(fit)
All differential-expressed proteins table with a = 0.05:
numGenes <- nrow(exprs(exp_set))
topTable(efit, number = numGenes) %>%
`rownames<-`(seq_len(numGenes)) %>%
filter(adj.P.Val < 0.05)
## Protein logFC AveExpr t P.Value adj.P.Val B
## 1 APP_N 0.1054 0.634 19.62 2.55e-76 1.96e-74 162.89613
## 2 Tau_N 0.0712 0.634 12.80 1.14e-35 4.38e-34 69.72518
## 3 ITSN1_N 0.0711 0.634 12.39 1.25e-33 3.21e-32 65.05321
## 4 S6_N 0.0703 0.634 12.12 2.78e-32 5.36e-31 61.97179
## 5 AcetylH3K9_N 0.0645 0.634 11.59 8.70e-30 1.34e-28 56.27899
## 6 pPKCG_N 0.0531 0.634 8.71 7.86e-18 1.01e-16 29.01689
## 7 DYRK1A_N 0.0507 0.634 8.51 4.06e-17 4.46e-16 27.39939
## 8 GluR3_N -0.0489 0.634 -8.31 2.15e-16 2.07e-15 25.75515
## 9 SYP_N -0.0460 0.634 -7.72 2.07e-14 1.77e-13 21.25617
## 10 MTOR_N -0.0455 0.634 -7.52 9.26e-14 7.13e-13 19.78595
## 11 AMPKA_N -0.0430 0.634 -7.17 1.20e-12 8.37e-12 17.27590
## 12 pGSK3B_N 0.0391 0.634 6.64 4.38e-11 2.81e-10 13.74767
## 13 pNR2A_N -0.0412 0.634 -6.61 5.52e-11 3.27e-10 13.51638
## 14 P38_N -0.0393 0.634 -6.38 2.36e-10 1.30e-09 12.09956
## 15 NR2B_N -0.0378 0.634 -6.19 7.58e-10 3.83e-09 10.96156
## 16 pCREB_N 0.0370 0.634 6.19 7.96e-10 3.83e-09 10.91563
## 17 BRAF_N 0.0348 0.634 5.86 5.73e-09 2.53e-08 8.99247
## 18 ARC_N -0.0343 0.634 -5.84 6.25e-09 2.53e-08 8.91154
## 19 pS6_N -0.0343 0.634 -5.84 6.25e-09 2.53e-08 8.91154
## 20 H3AcK18_N 0.0376 0.634 5.72 1.33e-08 5.10e-08 8.26580
## 21 IL1B_N -0.0317 0.634 -5.31 1.25e-07 4.47e-07 6.00462
## 22 pNR1_N -0.0329 0.634 -5.31 1.28e-07 4.47e-07 5.97713
## 23 pMTOR_N -0.0316 0.634 -5.14 3.08e-07 1.03e-06 5.12923
## 24 pGSK3B_Tyr216_N 0.0303 0.634 5.02 5.68e-07 1.82e-06 4.53705
## 25 pP70S6_N 0.0306 0.634 5.00 6.57e-07 2.02e-06 4.39986
## 26 NR2A_N -0.0292 0.634 -4.71 2.69e-06 7.97e-06 3.04040
## 27 CDK5_N 0.0267 0.634 4.47 8.53e-06 2.43e-05 1.93786
## 28 RAPTOR_N -0.0276 0.634 -4.44 9.81e-06 2.70e-05 1.80157
## 29 EGR1_N -0.0296 0.634 -4.28 1.97e-05 5.22e-05 1.24297
## 30 TIAM1_N 0.0257 0.634 4.16 3.37e-05 8.66e-05 0.62739
## 31 NUMB_N 0.0248 0.634 4.07 4.93e-05 1.22e-04 0.26650
## 32 H3MeK4_N 0.0281 0.634 3.97 7.62e-05 1.83e-04 -0.00279
## 33 pRSK_N 0.0240 0.634 3.92 9.25e-05 2.16e-04 -0.32679
## 34 pNR2B_N -0.0233 0.634 -3.84 1.26e-04 2.86e-04 -0.62141
## 35 SNCA_N -0.0232 0.634 -3.82 1.40e-04 3.08e-04 -0.72000
## 36 AKT_N -0.0216 0.634 -3.49 4.94e-04 1.06e-03 -1.90119
## 37 RRP1_N 0.0210 0.634 3.45 5.69e-04 1.18e-03 -2.03298
## 38 CAMKII_N -0.0194 0.634 -3.20 1.42e-03 2.88e-03 -2.87920
## 39 TRKA_N 0.0190 0.634 3.10 1.94e-03 3.84e-03 -3.16780
## 40 pELK_N 0.0181 0.634 2.93 3.47e-03 6.68e-03 -3.69993
## 41 NR1_N -0.0179 0.634 -2.88 3.99e-03 7.40e-03 -3.82627
## 42 P3525_N 0.0179 0.634 2.88 4.03e-03 7.40e-03 -3.83845
## 43 ADARB1_N -0.0175 0.634 -2.79 5.35e-03 9.57e-03 -4.09472
## 44 DSCR1_N 0.0157 0.634 2.56 1.06e-02 1.86e-02 -4.70565
## 45 GSK3B_N 0.0151 0.634 2.44 1.47e-02 2.51e-02 -4.99283
## 46 pCFOS_N -0.0139 0.634 -2.19 2.89e-02 4.83e-02 -5.54608
Let’s draw a heatmap:
my_list <- topTable(efit, coef = 2, n = 20)
dif_exp_set <- exp_set[fData(exp_set)$Protein %in% my_list$Protein, ]
dat <- as.matrix(exprs(dif_exp_set))
rownames(dat) <- fData(dif_exp_set)$Protein
colnames(dat) <- NULL
pal_green <- colorpanel(75, low = "black", mid = "darkgreen", high = "yellow")
heatmap.2(dat, col = pal_green, scale = "none", key=TRUE, symkey = FALSE, density.info = "none", trace = "none", cexRow = 0.9, cexCol = 1, margins = c(1, 6), keysize = 1, key.par = list(mar = c(2, 1, 2, 1)), key.xlab = '')
First 20 differential-expressed proteins in 1080 samples